{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# import necessary modules\n",
    "import pandas as pd, numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Part 1 <br /> Reviewing the Basics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 2, 3, 4]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a python list is a basic data type\n",
    "li = [1, 2, 3, 4]\n",
    "li"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 3, 4])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a numpy array is like a list, but faster and more compact\n",
    "ar = np.array([1, 2, 3, 4])\n",
    "ar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 3, 4])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can create a numpy array from an existing list too\n",
    "ar = np.array(li)\n",
    "ar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "1    2\n",
       "2    3\n",
       "3    4\n",
       "dtype: int32"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a pandas series is based on a numpy array - it's fast, compact, and has more functionality\n",
    "se1 = pd.Series(ar)\n",
    "se1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "2    c\n",
       "3    d\n",
       "dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can create a new Series by passing in a list variable or array\n",
    "# series can contain data types other than just integers\n",
    "se2 = pd.Series(['a', 'b', 'c', 'd'])\n",
    "se2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "the first \"column\" is an index, the second is the series of values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "w    a\n",
       "x    b\n",
       "y    c\n",
       "z    d\n",
       "dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can change a series's index\n",
    "se2.index = ['w', 'x', 'y', 'z']\n",
    "se2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0\n",
       "0  1\n",
       "1  2\n",
       "2  3\n",
       "3  4"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a pandas dataframe is like a table where each column is a series\n",
    "df = pd.DataFrame([1, 2, 3, 4])\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "this is a one-dimensional DataFrame... it's equivalent to a Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'column1': [1, 2, 3, 4], 'column2': [5, 6, 7, 8]}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a dict can contain multiple lists and label them\n",
    "di1 = {'column1':[1, 2, 3, 4], 'column2':[5, 6, 7, 8]}\n",
    "di1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'column1': [1, 2, 3, 4], 'column2': 0    1\n",
       " 1    2\n",
       " 2    3\n",
       " 3    4\n",
       " dtype: int32}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a dict can also contain multiple lists/series and labels\n",
    "di2 = {'column1':li, 'column2':se1}\n",
    "di2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column1</th>\n",
       "      <th>column2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   column1  column2\n",
       "0        1        5\n",
       "1        2        6\n",
       "2        3        7\n",
       "3        4        8"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a pandas dataframe can contain multiple columns/series\n",
    "# you can create a dataframe by passing in a list, array, series, or dict\n",
    "df = pd.DataFrame(di1)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "this is a two-dimensional DataFrame. Its index contains row labels (0, 1, 2, 3) and its columns are indexed by column labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RangeIndex(start=0, stop=4, step=1)\n",
      "Index([u'column1', u'column2'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# the row labels in the index are accessed by the .index attribute of the DataFrame object\n",
    "print(df.index)\n",
    "\n",
    "# the column labels are accessed by the .columns attribute of the DataFrame object\n",
    "print(df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>column1</th>\n",
       "      <th>column2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>w</th>\n",
       "      <td>NaN</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>x</th>\n",
       "      <td>NaN</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y</th>\n",
       "      <td>NaN</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>z</th>\n",
       "      <td>NaN</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   column1 column2\n",
       "0      1.0     NaN\n",
       "1      2.0     NaN\n",
       "2      3.0     NaN\n",
       "3      4.0     NaN\n",
       "w      NaN       a\n",
       "x      NaN       b\n",
       "y      NaN       c\n",
       "z      NaN       d"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make sure your indices match!\n",
    "di = {'column1':se1, 'column2':se2}\n",
    "df = pd.DataFrame(di)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nan\n",
      "<type 'float'>\n"
     ]
    }
   ],
   "source": [
    "# numpy offers a useful datatype called NaN for null values - has to be floating point, not int\n",
    "x = np.nan\n",
    "print(x)\n",
    "print(type(x))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Part 2: <br /> Working with CSV files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# pandas can load CSV files as DataFrames - it pulls column labels from the first row of the data file\n",
    "df2 = pd.read_csv('data/pandas-test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>california</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phoenix</td>\n",
       "      <td>arizona</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>seattle</td>\n",
       "      <td>washington</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dallas</td>\n",
       "      <td>texas</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denver</td>\n",
       "      <td>colorado</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city       state\n",
       "0  san francisco  california\n",
       "1        phoenix     arizona\n",
       "2        seattle  washington\n",
       "3         dallas       texas\n",
       "4         denver    colorado"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can view the first few or the last few rows of a DataFrame with the .head() or .tail() methods\n",
    "df2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>chicago</td>\n",
       "      <td>illinois</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>portland</td>\n",
       "      <td>oregon</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>miami</td>\n",
       "      <td>florida</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       city     state\n",
       "5   chicago  illinois\n",
       "6  portland    oregon\n",
       "7     miami   florida"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# returns 5 rows by default, or you can pass the number of rows you want as an argument\n",
    "df2.tail(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>california</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phoenix</td>\n",
       "      <td>arizona</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>seattle</td>\n",
       "      <td>washington</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dallas</td>\n",
       "      <td>texas</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denver</td>\n",
       "      <td>colorado</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>chicago</td>\n",
       "      <td>illinois</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>portland</td>\n",
       "      <td>oregon</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>miami</td>\n",
       "      <td>florida</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city       state country\n",
       "0  san francisco  california        \n",
       "1        phoenix     arizona        \n",
       "2        seattle  washington        \n",
       "3         dallas       texas        \n",
       "4         denver    colorado        \n",
       "5        chicago    illinois        \n",
       "6       portland      oregon        \n",
       "7          miami     florida        "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can add a new column to a DataFrame\n",
    "df2['country'] = ''\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>california</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phoenix</td>\n",
       "      <td>arizona</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>seattle</td>\n",
       "      <td>washington</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dallas</td>\n",
       "      <td>texas</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denver</td>\n",
       "      <td>colorado</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>chicago</td>\n",
       "      <td>illinois</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>portland</td>\n",
       "      <td>oregon</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>miami</td>\n",
       "      <td>florida</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city       state country\n",
       "0  san francisco  california     USA\n",
       "1        phoenix     arizona     USA\n",
       "2        seattle  washington     USA\n",
       "3         dallas       texas     USA\n",
       "4         denver    colorado     USA\n",
       "5        chicago    illinois     USA\n",
       "6       portland      oregon     USA\n",
       "7          miami     florida     USA"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can update the values of an entire column\n",
    "df2['country'] = 'USA'\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>california</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phoenix</td>\n",
       "      <td>arizona</td>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>seattle</td>\n",
       "      <td>washington</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dallas</td>\n",
       "      <td>texas</td>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denver</td>\n",
       "      <td>colorado</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>chicago</td>\n",
       "      <td>illinois</td>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>portland</td>\n",
       "      <td>oregon</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>miami</td>\n",
       "      <td>florida</td>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city       state        country\n",
       "0  san francisco  california            USA\n",
       "1        phoenix     arizona  United States\n",
       "2        seattle  washington            USA\n",
       "3         dallas       texas  United States\n",
       "4         denver    colorado            USA\n",
       "5        chicago    illinois  United States\n",
       "6       portland      oregon            USA\n",
       "7          miami     florida  United States"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can set the values of a column (aka, Series) in the DataFrame to a list of values\n",
    "df2['country'] = ['USA', 'United States'] * 4\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>california</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phoenix</td>\n",
       "      <td>arizona</td>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>seattle</td>\n",
       "      <td>washington</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dallas</td>\n",
       "      <td>texas</td>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denver</td>\n",
       "      <td>colorado</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>chicago</td>\n",
       "      <td>illinois</td>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>portland</td>\n",
       "      <td>oregon</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>miami</td>\n",
       "      <td>florida</td>\n",
       "      <td>United States</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city       state        country\n",
       "0  san francisco  california            USA\n",
       "1        phoenix     arizona  United States\n",
       "2        seattle  washington            USA\n",
       "3         dallas       texas  United States\n",
       "4         denver    colorado            USA\n",
       "5        chicago    illinois  United States\n",
       "6       portland      oregon            USA\n",
       "7          miami     florida  United States"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can use fast vectorized methods on a pandas series (aka, a column in our dataframe)\n",
    "df2['country'].str.replace('United States', 'USA')\n",
    "df2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "that didn't do anything to our dataframebecause .str.replace() returns the updated version - it doesn't perform the operation in place"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>california</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phoenix</td>\n",
       "      <td>arizona</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>seattle</td>\n",
       "      <td>washington</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dallas</td>\n",
       "      <td>texas</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denver</td>\n",
       "      <td>colorado</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>chicago</td>\n",
       "      <td>illinois</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>portland</td>\n",
       "      <td>oregon</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>miami</td>\n",
       "      <td>florida</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            city       state country\n",
       "0  san francisco  california     USA\n",
       "1        phoenix     arizona     USA\n",
       "2        seattle  washington     USA\n",
       "3         dallas       texas     USA\n",
       "4         denver    colorado     USA\n",
       "5        chicago    illinois     USA\n",
       "6       portland      oregon     USA\n",
       "7          miami     florida     USA"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we need to capture the updated values when they get returned\n",
    "df2['country'] = df2['country'].str.replace('United States', 'USA')\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city_name</th>\n",
       "      <th>state_name</th>\n",
       "      <th>nation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>san francisco</td>\n",
       "      <td>california</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phoenix</td>\n",
       "      <td>arizona</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>seattle</td>\n",
       "      <td>washington</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dallas</td>\n",
       "      <td>texas</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>denver</td>\n",
       "      <td>colorado</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>chicago</td>\n",
       "      <td>illinois</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>portland</td>\n",
       "      <td>oregon</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>miami</td>\n",
       "      <td>florida</td>\n",
       "      <td>USA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       city_name  state_name nation\n",
       "0  san francisco  california    USA\n",
       "1        phoenix     arizona    USA\n",
       "2        seattle  washington    USA\n",
       "3         dallas       texas    USA\n",
       "4         denver    colorado    USA\n",
       "5        chicago    illinois    USA\n",
       "6       portland      oregon    USA\n",
       "7          miami     florida    USA"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can change the column names\n",
    "df2.columns = ['city_name', 'state_name', 'nation']\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# you can save your DataFrame as a csv file\n",
    "df2.to_csv('data/my_data.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Part 3:<br />Dropping and editing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>year</th>\n",
       "      <th>country</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sf</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phx</td>\n",
       "      <td>NaN</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>phx</td>\n",
       "      <td>2005.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chi</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  city    year country      continent\n",
       "0   sf  2012.0     USA  North America\n",
       "1  phx     NaN     USA  North America\n",
       "2  phx  2005.0     USA  North America\n",
       "3  chi  2009.0     USA  North America"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# there are lots of ways to create dataframes\n",
    "list_of_tuples = [('sf', 2012), ('phx', np.nan), ('phx', 2005), ('chi', 2009)]\n",
    "df = pd.DataFrame(list_of_tuples, columns=['city', 'year'])\n",
    "df['country'] = 'USA'\n",
    "df['continent'] = 'North America'\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>year</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sf</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phx</td>\n",
       "      <td>NaN</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>phx</td>\n",
       "      <td>2005.0</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chi</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  city    year      continent\n",
       "0   sf  2012.0  North America\n",
       "1  phx     NaN  North America\n",
       "2  phx  2005.0  North America\n",
       "3  chi  2009.0  North America"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can remove a column from a dataframe with the .drop() method by referencing its label and axis\n",
    "# axis 0 = rows\n",
    "# axis 1 = columns\n",
    "df2 = df.drop('country', axis=1, inplace=False)\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sf</td>\n",
       "      <td>2012.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phx</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>phx</td>\n",
       "      <td>2005.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chi</td>\n",
       "      <td>2009.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  city    year\n",
       "0   sf  2012.0\n",
       "1  phx     NaN\n",
       "2  phx  2005.0\n",
       "3  chi  2009.0"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can also remove a column from a dataframe with the del() function\n",
    "df3 = pd.DataFrame(df2)\n",
    "del(df3['continent'])\n",
    "df3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "del() is a function but .drop() is a method. The difference is that a method is performed on an object, in this case a DataFrame. A function is independent of an object - it's just a chunk of code that is called by name and allows data to be passed to it by arguments."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can use the len() function to check the row count of a DataFrame\n",
    "len(df3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "city    4\n",
       "year    3\n",
       "dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can also use .count() method to check the row count, but this excludes NaNs\n",
    "df3.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4, 2)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# or you can use the .shape attribute to get the shape of the DataFrame\n",
    "df3.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An attribute is different than a method or a function. Notice it doesn't use parentheses like .shape()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "phx    2\n",
       "chi    1\n",
       "sf     1\n",
       "Name: city, dtype: int64"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can get a count of values that appear in some column\n",
    "df3['city'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sf</td>\n",
       "      <td>2012.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phx</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chi</td>\n",
       "      <td>2009.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  city    year\n",
       "0   sf  2012.0\n",
       "1  phx     NaN\n",
       "3  chi  2009.0"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can drop rows that contain duplicate values in some specified column\n",
    "df4 = df3.drop_duplicates('city')\n",
    "df4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "look at the index above. remember that it's not a row counter, it's an index of row labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>year</th>\n",
       "      <th>country</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sf</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phx</td>\n",
       "      <td>NaN</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>phx</td>\n",
       "      <td>2005.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chi</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  city    year country      continent\n",
       "0   sf  2012.0     USA  North America\n",
       "1  phx     NaN     USA  North America\n",
       "2  phx  2005.0     USA  North America\n",
       "3  chi  2009.0     USA  North America"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# back to our earlier dataframe\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>year</th>\n",
       "      <th>country</th>\n",
       "      <th>continent</th>\n",
       "      <th>year5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sf</td>\n",
       "      <td>2012.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "      <td>2017.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>phx</td>\n",
       "      <td>NaN</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>phx</td>\n",
       "      <td>2005.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "      <td>2010.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>chi</td>\n",
       "      <td>2009.0</td>\n",
       "      <td>USA</td>\n",
       "      <td>North America</td>\n",
       "      <td>2014.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  city    year country      continent   year5\n",
       "0   sf  2012.0     USA  North America  2017.0\n",
       "1  phx     NaN     USA  North America     NaN\n",
       "2  phx  2005.0     USA  North America  2010.0\n",
       "3  chi  2009.0     USA  North America  2014.0"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# you can perform operations across an entire Series (aka column in our DataFrame) at once\n",
    "df['year5'] = df['year'] + 5\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
